import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import cm
from matplotlib.colors import to_hex
import seaborn as sns
import numpy as np
import os
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.manifold import TSNE
from scipy.cluster.hierarchy import linkage, dendrogram
from scipy.stats import ttest_ind
from statsmodels.stats.multitest import multipletests
df = pd.read_csv('C:/Users/Lympha/Desktop/temp_dir/result_dataframes/pyrosetta_vanderwaals1_dataframe.csv')
print(df.info)
<bound method DataFrame.info of Unnamed: 0 pos1:M pos2:T pos3:E pos4:Y pos5:K pos6:L \
0 1A2B -4.634121 -4.146348 -6.905588 NaN -9.122749 -7.232380
1 1AA9 -3.042625 -4.531765 -5.041186 -10.754950 -7.986250 -10.842031
2 1AGP -4.445643 -4.050015 -4.136845 -9.588897 -5.769936 -7.571611
3 1AM4 NaN NaN NaN -4.634601 NaN NaN
4 1AN0 -3.005889 NaN NaN NaN NaN NaN
.. ... ... ... ... ... ... ...
376 8DNJ -3.826750 -4.097547 -3.945441 -9.479352 -5.985502 -7.596897
377 8EBZ NaN NaN NaN NaN NaN -0.419511
378 8EZG -3.986546 -3.816575 -3.409309 -9.352288 -7.078221 -8.010134
379 8F0M NaN NaN -0.344468 NaN NaN NaN
380 8IJ9 -0.591866 -3.584155 -2.763765 -4.392556 -3.831499 -8.309540
pos7:V pos8:V pos9:V ... pos180:G pos181:C pos182:M \
0 -6.795031 -8.118857 -6.478405 ... NaN NaN NaN
1 -10.338247 -8.625854 -8.293056 ... NaN NaN NaN
2 -7.344035 -6.534242 -6.874382 ... NaN NaN NaN
3 -4.134701 -2.325374 -7.163531 ... NaN NaN NaN
4 NaN NaN NaN ... NaN NaN NaN
.. ... ... ... ... ... ... ...
376 -7.042336 -6.848072 -6.283040 ... NaN NaN NaN
377 NaN NaN NaN ... NaN NaN NaN
378 -7.142634 -6.316100 -6.427623 ... NaN NaN NaN
379 NaN NaN NaN ... NaN NaN NaN
380 -7.882243 -9.378755 -8.349998 ... NaN NaN NaN
pos183:S pos184:C pos185:K pos186:C pos187:V pos188:L pos189:S
0 NaN NaN NaN NaN NaN NaN NaN
1 NaN NaN NaN NaN NaN NaN NaN
2 NaN NaN NaN NaN NaN NaN NaN
3 NaN NaN NaN NaN NaN NaN NaN
4 NaN NaN NaN NaN NaN NaN NaN
.. ... ... ... ... ... ... ...
376 NaN NaN NaN NaN NaN NaN NaN
377 NaN NaN NaN NaN NaN NaN NaN
378 NaN NaN NaN NaN NaN NaN NaN
379 NaN NaN NaN NaN NaN NaN NaN
380 NaN NaN NaN NaN NaN NaN NaN
[381 rows x 190 columns]>
print(df.head)
<bound method NDFrame.head of Unnamed: 0 pos1:M pos2:T pos3:E pos4:Y pos5:K pos6:L \
0 1A2B -4.634121 -4.146348 -6.905588 NaN -9.122749 -7.232380
1 1AA9 -3.042625 -4.531765 -5.041186 -10.754950 -7.986250 -10.842031
2 1AGP -4.445643 -4.050015 -4.136845 -9.588897 -5.769936 -7.571611
3 1AM4 NaN NaN NaN -4.634601 NaN NaN
4 1AN0 -3.005889 NaN NaN NaN NaN NaN
.. ... ... ... ... ... ... ...
376 8DNJ -3.826750 -4.097547 -3.945441 -9.479352 -5.985502 -7.596897
377 8EBZ NaN NaN NaN NaN NaN -0.419511
378 8EZG -3.986546 -3.816575 -3.409309 -9.352288 -7.078221 -8.010134
379 8F0M NaN NaN -0.344468 NaN NaN NaN
380 8IJ9 -0.591866 -3.584155 -2.763765 -4.392556 -3.831499 -8.309540
pos7:V pos8:V pos9:V ... pos180:G pos181:C pos182:M \
0 -6.795031 -8.118857 -6.478405 ... NaN NaN NaN
1 -10.338247 -8.625854 -8.293056 ... NaN NaN NaN
2 -7.344035 -6.534242 -6.874382 ... NaN NaN NaN
3 -4.134701 -2.325374 -7.163531 ... NaN NaN NaN
4 NaN NaN NaN ... NaN NaN NaN
.. ... ... ... ... ... ... ...
376 -7.042336 -6.848072 -6.283040 ... NaN NaN NaN
377 NaN NaN NaN ... NaN NaN NaN
378 -7.142634 -6.316100 -6.427623 ... NaN NaN NaN
379 NaN NaN NaN ... NaN NaN NaN
380 -7.882243 -9.378755 -8.349998 ... NaN NaN NaN
pos183:S pos184:C pos185:K pos186:C pos187:V pos188:L pos189:S
0 NaN NaN NaN NaN NaN NaN NaN
1 NaN NaN NaN NaN NaN NaN NaN
2 NaN NaN NaN NaN NaN NaN NaN
3 NaN NaN NaN NaN NaN NaN NaN
4 NaN NaN NaN NaN NaN NaN NaN
.. ... ... ... ... ... ... ...
376 NaN NaN NaN NaN NaN NaN NaN
377 NaN NaN NaN NaN NaN NaN NaN
378 NaN NaN NaN NaN NaN NaN NaN
379 NaN NaN NaN NaN NaN NaN NaN
380 NaN NaN NaN NaN NaN NaN NaN
[381 rows x 190 columns]>
metadata_df = pd.read_csv('C:/Users/Lympha/Desktop/temp_dir/result_dataframes/metadata_dataframe.csv')
metadata_df.head()
| Unnamed: 0 | Title | Structure Details | Source Organism | Taxonomy ID | Abstract | Method | Resolution | Original Number of Models | Original Number of Chains | ... | Number of ILE | Number of GLN | Number of ASN | Number of HIS | Number of PHE | Number of ASP | Number of PRO | Number of ARG | Number of CYS | Number of TRP | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1A2B | HUMAN RHOA COMPLEXED WITH GTP ANALOGUE | NaN | Homo sapiens | 9606 | The 2.4-A resolution crystal structure of a do... | x-ray diffraction | 2.4 | 1 | 1 | ... | 10 | 5 | 6 | 2.0 | 7 | 15 | 11.0 | 10 | 5.0 | 2.0 |
| 1 | 1AA9 | HUMAN C-HA-RAS(1-171)(DOT)GDP, NMR, MINIMIZED ... | NaN | Homo sapiens | 9606 | The backbone 1H, 13C, and 15N resonances of th... | solution nmr | NaN | 1 | 1 | ... | 11 | 11 | 4 | 3.0 | 5 | 14 | 3.0 | 12 | 3.0 | NaN |
| 2 | 1AGP | THREE-DIMENSIONAL STRUCTURES AND PROPERTIES OF... | C-H-RAS P21 PROTEIN MUTANT WITH GLY 12 REPLACE... | Homo sapiens | 9606 | The three-dimensional structures and biochemic... | x-ray diffraction | 2.3 | 1 | 1 | ... | 11 | 11 | 4 | 3.0 | 5 | 15 | 3.0 | 11 | 3.0 | NaN |
| 3 | 1AM4 | COMPLEX BETWEEN CDC42HS.GMPPNP AND P50 RHOGAP ... | NaN | Homo sapiens | 9606 | Small G proteins transduce signals from plasma... | x-ray diffraction | 2.7 | 1 | 6 | ... | 8 | 6 | 5 | 2.0 | 8 | 11 | 12.0 | 5 | 5.0 | 1.0 |
| 4 | 1AN0 | CDC42HS-GDP COMPLEX | NaN | Homo sapiens | 9606 | No DOI found | x-ray diffraction | 2.8 | 1 | 2 | ... | 8 | 6 | 5 | 2.0 | 8 | 11 | 15.0 | 6 | 6.0 | 1.0 |
5 rows × 42 columns
plt.figure(figsize=(100,70))
sns.heatmap(df.drop(columns=['Unnamed: 0']), cmap='viridis')
plt.title('Attractive Van der Waals on HRAS Experimental Structures')
plt.xlabel('Metric')
plt.ylabel('Structures')
plt.show
<function matplotlib.pyplot.show(close=None, block=None)>
plt.figure(figsize=(15,10))
sns.histplot(df.drop(columns=['Unnamed: 0']).values.flatten(), bins=50, kde=True)
plt.xlabel('Value')
plt.ylabel('Density')
plt.title('Distribution of Values in Attractive Van der Waals Dataframe')
plt.show()
nan_percentage = df.isnull().mean() * 100
plt.figure(figsize=(100,70))
sns.barplot(x=nan_percentage.index, y=nan_percentage.values)
plt.xticks(rotation=90)
plt.ylabel('Percentage of NaN values')
plt.title('Percentage of NaN values in each column of Attractive Van der Waals')
plt.show()
merged_nonnorm_df = pd.merge(df, metadata_df[['Unnamed: 0', 'Read Activity Status']], on='Unnamed: 0')
melted_nonnorm = pd.melt(merged_nonnorm_df, id_vars=['Unnamed: 0', 'Read Activity Status'], var_name='Amino Acid Position', value_name='Value')
plt.figure(figsize=(50,40))
sns.violinplot(x="Amino Acid Position", y="Value", hue="Read Activity Status", data=melted_nonnorm, split=True, inner="quart", palette={"active": "red", "inactive": "blue"})
plt.xticks(rotation=90)
plt.title('Violin Plot for All Amino Acid Positions')
plt.xlabel('Amino Acid Position')
plt.ylabel('Value')
plt.legend(title='Read Activity Status')
plt.tight_layout()
plt.show()
protein_codes = df['Unnamed: 0']
feature_df_numeric = df.drop(columns=['Unnamed: 0'])
scaler = StandardScaler()
feature_normalized = scaler.fit_transform(feature_df_numeric)
feature_normalized_df = pd.DataFrame(feature_normalized, columns=feature_df_numeric.columns)
feature_normalized_df.fillna(0, inplace=True)
C:\Users\Lympha\anaconda3\lib\site-packages\sklearn\utils\extmath.py:985: RuntimeWarning: invalid value encountered in true_divide updated_mean = (last_sum + new_sum) / updated_sample_count C:\Users\Lympha\anaconda3\lib\site-packages\sklearn\utils\extmath.py:990: RuntimeWarning: invalid value encountered in true_divide T = new_sum / new_sample_count C:\Users\Lympha\anaconda3\lib\site-packages\sklearn\utils\extmath.py:1020: RuntimeWarning: invalid value encountered in true_divide new_unnormalized_variance -= correction ** 2 / new_sample_count
plt.figure(figsize=(100,70))
sns.heatmap(feature_normalized_df, cmap="YlGnBu", cbar_kws={'label': 'Z-score'})
plt.title('Heatmap of Normalized Attractive Van der Waals dataframe')
plt.show()
plt.figure(figsize=(15,10))
sns.histplot(feature_normalized_df.values.flatten(), bins=50, kde=True)
plt.xlabel('Value')
plt.ylabel('Density')
plt.title('Distribution of Values in Attractive Van der Waals dataframe')
plt.show()
merged_df = pd.merge(feature_normalized_df, metadata_df, left_on=protein_codes, right_on="Unnamed: 0")
X = feature_normalized_df
y = metadata_df["Read Activity Status"]
y_factorized = pd.factorize(y)[0]
merged_df.head()
| pos1:M | pos2:T | pos3:E | pos4:Y | pos5:K | pos6:L | pos7:V | pos8:V | pos9:V | pos10:G | ... | Number of ILE | Number of GLN | Number of ASN | Number of HIS | Number of PHE | Number of ASP | Number of PRO | Number of ARG | Number of CYS | Number of TRP | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | -0.976168 | -0.370058 | -1.348450 | 0.000000 | -1.156894 | 0.208173 | 0.630605 | -0.633860 | 1.008364 | 0.095160 | ... | 10 | 5 | 6 | 2.0 | 7 | 15 | 11.0 | 10 | 5.0 | 2.0 |
| 1 | -0.026606 | -0.647363 | -0.310813 | -1.325042 | -0.510752 | -2.233082 | -2.582462 | -1.091958 | -1.171333 | 0.247974 | ... | 11 | 11 | 4 | 3.0 | 5 | 14 | 3.0 | 12 | 3.0 | NaN |
| 2 | -0.863714 | -0.300747 | 0.192501 | -0.843519 | 0.749304 | -0.021253 | 0.132756 | 0.797923 | 0.532730 | 0.486202 | ... | 11 | 11 | 4 | 3.0 | 5 | 15 | 3.0 | 11 | 3.0 | NaN |
| 3 | 0.000000 | 0.000000 | 0.000000 | 1.202366 | 0.000000 | 0.000000 | 3.043052 | 4.600855 | 0.185414 | 0.000000 | ... | 8 | 6 | 5 | 2.0 | 8 | 11 | 12.0 | 5 | 5.0 | 1.0 |
| 4 | -0.004688 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.809350 | ... | 8 | 6 | 5 | 2.0 | 8 | 11 | 15.0 | 6 | 6.0 | 1.0 |
5 rows × 231 columns
melted_data = pd.melt(X.iloc[:, :len(feature_normalized_df.columns)], value_vars=X.iloc[:, :len(feature_normalized_df.columns)].columns)
melted_data['Activity Status'] = np.tile(y, len(X.columns[:len(feature_normalized_df.columns)]))
plt.figure(figsize=(50,40))
sns.violinplot(x="variable", y="value", hue="Activity Status", data=melted_data, split=True, inner="quart", palette={"active": "red", "inactive": "blue"})
plt.xticks(rotation=90)
plt.title('Violin Plot for All Amino Acid Positions')
plt.xlabel('Amino Acid Position')
plt.ylabel('Value')
plt.legend(title='Activity Status')
plt.tight_layout()
plt.show()
correlation_matrix = X.iloc[:, :len(feature_normalized_df.columns)].corr()
correlation_with_target = X.iloc[:, :len(feature_normalized_df.columns)].apply(lambda x: x.corr(pd.Series(y_factorized)))
plt.figure(figsize=(100, 70))
sns.heatmap(correlation_matrix, cmap="coolwarm", vmin=-1, vmax=1, cbar_kws={'label': 'Correlation'})
plt.title('Correlation Matrix of Amino Acid Positions')
plt.show()
correlation_with_target_abs = correlation_with_target.abs().sort_values(ascending=False)
correlation_with_target_sorted = correlation_with_target[correlation_with_target_abs.index]
correlation_with_target_sorted.head(10)
pos33:D 0.385023 pos64:Y -0.310513 pos72:M -0.276453 pos166:H 0.253190 pos165:Q 0.245898 pos9:V -0.206360 pos106:S 0.202195 pos143:E -0.197152 pos39:S 0.192843 pos16:K -0.192773 dtype: float64
linked = linkage(feature_normalized, method='ward')
color_map = {
"active": "red",
"inactive": "blue"
}
labels = df["Unnamed: 0"].values
plt.figure(figsize=(20,15))
dendro_data = dendrogram(linked, orientation='top', distance_sort='descending', show_leaf_counts=True, labels=labels)
ax = plt.gca()
xlbls = ax.get_xmajorticklabels()
for lbl in xlbls:
structure_id = lbl.get_text()
color = color_map[merged_df[merged_df["Unnamed: 0"] == structure_id]["Read Activity Status"].values[0]]
lbl.set_color(color)
plt.title('Full Hierarchical Clustering Dendrogram for Attractive Van der Waals dataframe (Colored by Activation Status)')
plt.xlabel('Protein Structure Names')
plt.ylabel('Distance (Ward)')
plt.xticks(rotation=90) # Rotate x-axis labels for better readability
plt.show()
merged_df = pd.merge(feature_normalized_df, metadata_df, left_on=protein_codes, right_on="Unnamed: 0")
X = feature_normalized_df
y = merged_df["Read Activity Status"]
merged_df.head()
| pos1:M | pos2:T | pos3:E | pos4:Y | pos5:K | pos6:L | pos7:V | pos8:V | pos9:V | pos10:G | ... | Number of ILE | Number of GLN | Number of ASN | Number of HIS | Number of PHE | Number of ASP | Number of PRO | Number of ARG | Number of CYS | Number of TRP | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | -0.976168 | -0.370058 | -1.348450 | 0.000000 | -1.156894 | 0.208173 | 0.630605 | -0.633860 | 1.008364 | 0.095160 | ... | 10 | 5 | 6 | 2.0 | 7 | 15 | 11.0 | 10 | 5.0 | 2.0 |
| 1 | -0.026606 | -0.647363 | -0.310813 | -1.325042 | -0.510752 | -2.233082 | -2.582462 | -1.091958 | -1.171333 | 0.247974 | ... | 11 | 11 | 4 | 3.0 | 5 | 14 | 3.0 | 12 | 3.0 | NaN |
| 2 | -0.863714 | -0.300747 | 0.192501 | -0.843519 | 0.749304 | -0.021253 | 0.132756 | 0.797923 | 0.532730 | 0.486202 | ... | 11 | 11 | 4 | 3.0 | 5 | 15 | 3.0 | 11 | 3.0 | NaN |
| 3 | 0.000000 | 0.000000 | 0.000000 | 1.202366 | 0.000000 | 0.000000 | 3.043052 | 4.600855 | 0.185414 | 0.000000 | ... | 8 | 6 | 5 | 2.0 | 8 | 11 | 12.0 | 5 | 5.0 | 1.0 |
| 4 | -0.004688 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.809350 | ... | 8 | 6 | 5 | 2.0 | 8 | 11 | 15.0 | 6 | 6.0 | 1.0 |
5 rows × 231 columns
pca = PCA(n_components=2)
principal_components = pca.fit_transform(X.iloc[:, :len(feature_normalized_df.columns)])
pca_df = pd.DataFrame(data=principal_components, columns=['Principal Component 1', 'Principal Component 2'])
pca_df['Activity Status'] = y
explained_variance = pca.explained_variance_ratio_
plt.figure(figsize=(10, 7))
sns.scatterplot(x='Principal Component 1', y='Principal Component 2', hue='Activity Status', data=pca_df, palette={"active": "red", "inactive": "blue"})
plt.xlabel(f'Principal Component 1 ({explained_variance[0]*100:.2f}%)')
plt.ylabel(f'Principal Component 2 ({explained_variance[1]*100:.2f}%)')
plt.title('2D PCA of Amino Acid Positions')
plt.show()
pca_3d = PCA(n_components=3)
principal_components_3d = pca_3d.fit_transform(X)
pca_df_3d = pd.DataFrame(data=principal_components_3d, columns=['Principal Component 1', 'Principal Component 2', 'Principal Component 3'])
pca_df_3d['Activity Status'] = y
colors = {'inactive': 'blue', 'active': 'red'}
explained_variance_3d = pca_3d.explained_variance_ratio_
fig = plt.figure(figsize=(15,10))
ax = fig.add_subplot(111, projection='3d')
ax.scatter(pca_df_3d['Principal Component 1'], pca_df_3d['Principal Component 2'], pca_df_3d['Principal Component 3'], c=pca_df_3d["Activity Status"].map(colors), s=50, label=pca_df_3d["Activity Status"].unique())
ax.set_xlabel(f'Principal Component 1 ({explained_variance_3d[0]*100:.2f}%)')
ax.set_ylabel(f'Principal Component 2 ({explained_variance_3d[1]*100:.2f}%)')
ax.set_zlabel(f'Principal Component 3 ({explained_variance_3d[2]*100:.2f}%)')
ax.set_title('3D PCA of Amino Acid Positions')
legend_handles = [plt.Line2D([0], [0], marker='o', color='w', label=status, markersize=10, markerfacecolor=colors[status]) for status in colors]
ax.legend(handles=legend_handles, title='Activity Status')
plt.show()
tsne = TSNE(n_components=2, random_state=1)
tsne_2d = tsne.fit_transform(X)
tsne_df = pd.DataFrame(data=tsne_2d, columns=['t-SNE 1', 't-SNE 2'])
tsne_df['Activity Status'] = y
plt.figure(figsize=(10, 7))
sns.scatterplot(x='t-SNE 1', y='t-SNE 2', hue='Activity Status', data=tsne_df, palette={"active": "red", "inactive": "blue"})
plt.title('t-SNE Projection of Attractive Van der Waals Data')
plt.show()
C:\Users\Lympha\anaconda3\lib\site-packages\sklearn\manifold\_t_sne.py:780: FutureWarning: The default initialization in TSNE will change from 'random' to 'pca' in 1.2. warnings.warn( C:\Users\Lympha\anaconda3\lib\site-packages\sklearn\manifold\_t_sne.py:790: FutureWarning: The default learning rate in TSNE will change from 200.0 to 'auto' in 1.2. warnings.warn(
label_encoder = LabelEncoder()
y_factorized = label_encoder.fit_transform(y)
rf_clf = RandomForestClassifier(n_estimators=100, random_state=1)
rf_clf.fit(X.iloc[:, :len(feature_normalized_df.columns)], y_factorized)
feature_importances = rf_clf.feature_importances_
importance_df = pd.DataFrame({
'Amino Acid Position': X.columns[:len(feature_normalized_df.columns)],
'Importance': feature_importances
})
sorted_importance_df = importance_df.sort_values(by='Importance', ascending=False)
top_n = 15
selected_aminoacids = sorted_importance_df['Amino Acid Position'][:top_n]
sorted_importance_df
| Amino Acid Position | Importance | |
|---|---|---|
| 32 | pos33:D | 0.037238 |
| 11 | pos12:G | 0.028498 |
| 34 | pos35:T | 0.028072 |
| 59 | pos60:G | 0.026595 |
| 31 | pos32:Y | 0.023720 |
| ... | ... | ... |
| 173 | pos174:P | 0.000000 |
| 172 | pos173:P | 0.000000 |
| 170 | pos171:L | 0.000000 |
| 168 | pos169:R | 0.000000 |
| 188 | pos189:S | 0.000000 |
189 rows × 2 columns
top_features = sorted_importance_df.head(top_n)
colors = cm.Set2(np.linspace(0, 1, top_n))
plt.figure(figsize=(10, 8))
bars = plt.barh(top_features['Amino Acid Position'], top_features['Importance'], color=colors)
plt.gca().invert_yaxis() # to have the most important feature at the top
plt.title('Top {} Amino Acid Positions by Importance in Van der Waals Attractive Profile'.format(top_n))
plt.xlabel('Importance')
plt.ylabel('Amino Acid Position')
plt.tight_layout()
plt.show()
colors = cm.Set2(np.linspace(0, 1, len(selected_aminoacids)))
hex_colors = [to_hex(color) for color in colors]
color_dict = dict(zip(selected_aminoacids, hex_colors))
plt.figure(figsize=(15, 10))
for position in selected_aminoacids:
sns.distplot(X[position], label=position, hist=False, color=color_dict[position])
plt.title('Distribution of Selected Amino Acid Positions')
plt.xlabel('Value')
plt.ylabel('Density')
plt.legend()
plt.show()
C:\Users\Lympha\anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `kdeplot` (an axes-level function for kernel density plots). warnings.warn(msg, FutureWarning) C:\Users\Lympha\anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `kdeplot` (an axes-level function for kernel density plots). warnings.warn(msg, FutureWarning) C:\Users\Lympha\anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `kdeplot` (an axes-level function for kernel density plots). warnings.warn(msg, FutureWarning) C:\Users\Lympha\anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `kdeplot` (an axes-level function for kernel density plots). warnings.warn(msg, FutureWarning) C:\Users\Lympha\anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `kdeplot` (an axes-level function for kernel density plots). warnings.warn(msg, FutureWarning) C:\Users\Lympha\anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `kdeplot` (an axes-level function for kernel density plots). warnings.warn(msg, FutureWarning) C:\Users\Lympha\anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `kdeplot` (an axes-level function for kernel density plots). warnings.warn(msg, FutureWarning) C:\Users\Lympha\anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `kdeplot` (an axes-level function for kernel density plots). warnings.warn(msg, FutureWarning) C:\Users\Lympha\anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `kdeplot` (an axes-level function for kernel density plots). warnings.warn(msg, FutureWarning) C:\Users\Lympha\anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `kdeplot` (an axes-level function for kernel density plots). warnings.warn(msg, FutureWarning) C:\Users\Lympha\anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `kdeplot` (an axes-level function for kernel density plots). warnings.warn(msg, FutureWarning) C:\Users\Lympha\anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `kdeplot` (an axes-level function for kernel density plots). warnings.warn(msg, FutureWarning) C:\Users\Lympha\anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `kdeplot` (an axes-level function for kernel density plots). warnings.warn(msg, FutureWarning) C:\Users\Lympha\anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `kdeplot` (an axes-level function for kernel density plots). warnings.warn(msg, FutureWarning) C:\Users\Lympha\anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `kdeplot` (an axes-level function for kernel density plots). warnings.warn(msg, FutureWarning)
selected_correlation_matrix = correlation_matrix.loc[selected_aminoacids, selected_aminoacids]
plt.figure(figsize=(10, 8))
sns.heatmap(selected_correlation_matrix, cmap="coolwarm", annot=True, vmin=-1, vmax=1, cbar_kws={'label': 'Correlation'})
plt.title('Correlation Matrix of Important Amino Acid Positions')
plt.show()
plt.figure(figsize=(10, 6))
for idx, position in enumerate(selected_aminoacids):
plt.subplot(3, 5, idx+1)
sns.boxplot(x=y_factorized, y=X[position])
plt.title(f'{position}:{round(sorted_importance_df.iloc[idx, sorted_importance_df.columns.get_loc("Importance")], 4)}')
plt.xlabel('Activity Status')
plt.ylabel('Value')
plt.tight_layout()
plt.show()
melted_data_selected = pd.melt(X[selected_aminoacids], value_vars=selected_aminoacids)
melted_data_selected['Activity Status'] = np.tile(y, len(selected_aminoacids))
plt.figure(figsize=(10, 6))
sns.violinplot(x="variable", y="value", hue="Activity Status", data=melted_data_selected, split=True, inner="quart", palette={"active": "red", "inactive": "blue"})
plt.title('Violin Plot for Selected Amino Acid Positions')
plt.xlabel('Amino Acid Position')
plt.ylabel('Value')
plt.legend(title='Activity Status')
plt.tight_layout()
plt.show()
active_data = X[y == "active"][selected_aminoacids]
inactive_data = X[y == "inactive"][selected_aminoacids]
t_stats = []
p_values = []
for position in selected_aminoacids:
t_stat, p_value = ttest_ind(active_data[position], inactive_data[position])
t_stats.append(t_stat)
p_values.append(p_value)
t_test_results = pd.DataFrame({
'Amino Acid Position': selected_aminoacids,
'T-Statistic': t_stats,
'P-Value': p_values
})
t_test_results
| Amino Acid Position | T-Statistic | P-Value | |
|---|---|---|---|
| 32 | pos33:D | 8.121724 | 6.537149e-15 |
| 11 | pos12:G | -2.274193 | 2.351280e-02 |
| 34 | pos35:T | -3.749601 | 2.047636e-04 |
| 59 | pos60:G | -1.735855 | 8.340213e-02 |
| 31 | pos32:Y | 2.828288 | 4.928337e-03 |
| 35 | pos36:I | 3.551552 | 4.312112e-04 |
| 9 | pos10:G | -0.910838 | 3.629594e-01 |
| 38 | pos39:S | 3.826063 | 1.522440e-04 |
| 12 | pos13:G | -1.411765 | 1.588397e-01 |
| 63 | pos64:Y | -6.359388 | 5.834388e-10 |
| 58 | pos59:A | -2.887402 | 4.107061e-03 |
| 15 | pos16:K | -3.824618 | 1.531060e-04 |
| 71 | pos72:M | -5.600230 | 4.118387e-08 |
| 16 | pos17:S | 2.597354 | 9.760417e-03 |
| 56 | pos57:D | -2.966500 | 3.203013e-03 |
colors = cm.Set2(np.linspace(0, 1, top_n))
fig, ax1 = plt.subplots(figsize=(12, 8))
bars = ax1.bar(t_test_results['Amino Acid Position'], t_test_results['T-Statistic'], color=colors, label='T-Statistic')
ax2 = ax1.twinx()
ax2.scatter(t_test_results['Amino Acid Position'], t_test_results['P-Value'], color='red', marker='o', label='P-Value')
ax2.axhline(y=0.05, color='black', linestyle='--') # significance threshold
ax2.set_ylim(0, 1)
ax1.set_ylabel('T-Statistic')
ax2.set_ylabel('P-Value', color='red')
ax1.set_xlabel('Amino Acid Position')
ax1.set_title(f'T-Test Results for Top {top_n} Amino Acid Positions')
ax1.legend(loc='upper left')
ax2.legend(loc='upper right')
plt.tight_layout()
plt.show()
bonferroni_corrected_pvalues = multipletests(t_test_results['P-Value'], method='bonferroni')[1]
fdr_corrected_pvalues = multipletests(t_test_results['P-Value'], method='fdr_bh')[1]
t_test_results['Bonferroni Corrected P-Value'] = bonferroni_corrected_pvalues
t_test_results['FDR Corrected P-Value'] = fdr_corrected_pvalues
t_test_results
| Amino Acid Position | T-Statistic | P-Value | Bonferroni Corrected P-Value | FDR Corrected P-Value | |
|---|---|---|---|---|---|
| 32 | pos33:D | 8.121724 | 6.537149e-15 | 9.805724e-14 | 9.805724e-14 |
| 11 | pos12:G | -2.274193 | 2.351280e-02 | 3.526920e-01 | 2.939100e-02 |
| 34 | pos35:T | -3.749601 | 2.047636e-04 | 3.071455e-03 | 5.119091e-04 |
| 59 | pos60:G | -1.735855 | 8.340213e-02 | 1.000000e+00 | 9.623323e-02 |
| 31 | pos32:Y | 2.828288 | 4.928337e-03 | 7.392506e-02 | 7.392506e-03 |
| 35 | pos36:I | 3.551552 | 4.312112e-04 | 6.468168e-03 | 9.240239e-04 |
| 9 | pos10:G | -0.910838 | 3.629594e-01 | 1.000000e+00 | 3.629594e-01 |
| 38 | pos39:S | 3.826063 | 1.522440e-04 | 2.283659e-03 | 4.593180e-04 |
| 12 | pos13:G | -1.411765 | 1.588397e-01 | 1.000000e+00 | 1.701854e-01 |
| 63 | pos64:Y | -6.359388 | 5.834388e-10 | 8.751582e-09 | 4.375791e-09 |
| 58 | pos59:A | -2.887402 | 4.107061e-03 | 6.160591e-02 | 6.845101e-03 |
| 15 | pos16:K | -3.824618 | 1.531060e-04 | 2.296590e-03 | 4.593180e-04 |
| 71 | pos72:M | -5.600230 | 4.118387e-08 | 6.177580e-07 | 2.059193e-07 |
| 16 | pos17:S | 2.597354 | 9.760417e-03 | 1.464063e-01 | 1.330966e-02 |
| 56 | pos57:D | -2.966500 | 3.203013e-03 | 4.804520e-02 | 6.005650e-03 |
t_test_results.to_clipboard()
colors = cm.Set2(np.linspace(0, 1, top_n))
fig, ax1 = plt.subplots(figsize=(12,8))
bars = ax1.bar(t_test_results['Amino Acid Position'], t_test_results['T-Statistic'], color=colors, label='T-Statistic')
ax2 = ax1.twinx()
ax2.scatter(t_test_results['Amino Acid Position'], t_test_results['FDR Corrected P-Value'], color='red', marker='o', label='FDR Corrected P-Value')
ax2.axhline(y=0.05, color='black', linestyle='--') # significance threshold
ax2.set_ylim(0, 1)
ax1.set_ylabel('T-Statistic')
ax2.set_ylabel('P-Value', color='red')
ax1.set_xlabel('Amino Acid Position')
ax1.set_title(f'T-Test Results for Top {top_n} Amino Acid Positions in Van der Waals Attractive Profile')
ax1.legend(loc='upper left')
ax2.legend(loc='upper right')
plt.tight_layout()
plt.show()